In [1]:
%pip install pandas-datareader
Requirement already satisfied: pandas-datareader in c:\users\shuos\anaconda3\lib\site-packages (0.10.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: lxml in c:\users\shuos\anaconda3\lib\site-packages (from pandas-datareader) (5.2.1)
Requirement already satisfied: pandas>=0.23 in c:\users\shuos\anaconda3\lib\site-packages (from pandas-datareader) (2.2.2)
Requirement already satisfied: requests>=2.19.0 in c:\users\shuos\anaconda3\lib\site-packages (from pandas-datareader) (2.32.2)
Requirement already satisfied: numpy>=1.26.0 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\shuos\anaconda3\lib\site-packages (from pandas>=0.23->pandas-datareader) (2023.3)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\shuos\anaconda3\lib\site-packages (from requests>=2.19.0->pandas-datareader) (2024.8.30)
Requirement already satisfied: six>=1.5 in c:\users\shuos\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.23->pandas-datareader) (1.16.0)
In [2]:
%pip install BeautifulSoup
Collecting BeautifulSoup
Note: you may need to restart the kernel to use updated packages.
  Using cached BeautifulSoup-3.2.2.tar.gz (32 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [7 lines of output]
      Traceback (most recent call last):
        File "<string>", line 2, in <module>
        File "<pip-setuptools-caller>", line 34, in <module>
        File "C:\Users\shuos\AppData\Local\Temp\pip-install-y827_syz\beautifulsoup_ba2a3fc17f69417f98e1bc3e12db11f6\setup.py", line 3
          "You're trying to run a very old release of Beautiful Soup under Python 3. This will not work."<>"Please use Beautiful Soup 4, available through the pip package 'beautifulsoup4'."
                                                                                                         ^^
      SyntaxError: invalid syntax
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

× Encountered error while generating package metadata.
╰─> See above for output.

note: This is an issue with the package mentioned above, not pip.
hint: See above for details.
In [3]:
%pip install bs4
Requirement already satisfied: bs4 in c:\users\shuos\anaconda3\lib\site-packages (0.0.2)
Requirement already satisfied: beautifulsoup4 in c:\users\shuos\anaconda3\lib\site-packages (from bs4) (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\shuos\anaconda3\lib\site-packages (from beautifulsoup4->bs4) (2.5)
Note: you may need to restart the kernel to use updated packages.
In [4]:
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
import textwrap
In [5]:
pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/storms.csv", nrows=10
)
rownames name year month day hour lat long status category wind pressure tropicalstorm_force_diameter hurricane_force_diameter
0 1 Amy 1975 6 27 0 27.5 -79.0 tropical depression NaN 25 1013 NaN NaN
1 2 Amy 1975 6 27 6 28.5 -79.0 tropical depression NaN 25 1013 NaN NaN
2 3 Amy 1975 6 27 12 29.5 -79.0 tropical depression NaN 25 1013 NaN NaN
3 4 Amy 1975 6 27 18 30.5 -79.0 tropical depression NaN 25 1013 NaN NaN
4 5 Amy 1975 6 28 0 31.5 -78.8 tropical depression NaN 25 1012 NaN NaN
5 6 Amy 1975 6 28 6 32.4 -78.7 tropical depression NaN 25 1012 NaN NaN
6 7 Amy 1975 6 28 12 33.3 -78.0 tropical depression NaN 25 1011 NaN NaN
7 8 Amy 1975 6 28 18 34.0 -77.0 tropical depression NaN 30 1006 NaN NaN
8 9 Amy 1975 6 29 0 34.4 -75.8 tropical storm NaN 35 1004 NaN NaN
9 10 Amy 1975 6 29 6 34.0 -74.8 tropical storm NaN 40 1002 NaN NaN
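Once the preview looks right, the full file can be pulled down and summarised with the usual pandas tools; a minimal sketch, counting observations by storm status (column names taken from the preview above):

# Read the full storms dataset and count rows in each storm status category
storms = pd.read_csv(
    "https://vincentarelbundock.github.io/Rdatasets/csv/dplyr/storms.csv"
)
storms["status"].value_counts().head()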
In [6]:
url = "http://aeturrell.com/research"
page = requests.get(url)
page.text[:300]
'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>\n\n<meta charset="utf-8">\n<meta name="generator" content="quarto-1.5.56">\n\n<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n\n<meta name="author" content="Arthur Turrell">\n'
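Before handing the HTML to a parser, it's worth confirming that the request actually succeeded; a small check using standard requests attributes:

# 200 means the page was retrieved successfully;
# raise_for_status() raises an exception for 4xx/5xx responses
print(page.status_code)
page.raise_for_status()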
In [7]:
soup = BeautifulSoup(page.text, "html.parser")
print(soup.prettify()[60000:60500])
       </div>
          <div class="project-category">
           <a href="#category=gender pay gap">
            gender pay gap
           </a>
          </div>
          <div class="project-category">
           <a href="#category=labour">
            labour
           </a>
          </div>
          <div class="project-category">
           <a href="#category=text analysis">
            text analysis
           </a>
          </div>
         </div>
         <div class="project-details-listing
In [8]:
# Get all paragraphs
all_paras = soup.find_all("p")
# Just show one of the paras
all_paras[1]
<p>Botta, Federico, Robin Lovelace, Laura Gilbert, and Arthur Turrell. "Packaging code and data for reproducible research: A case study of journey time statistics." <i>Environment and Planning B: Urban Analytics and City Science</i> (2024): 23998083241267331. doi: <a href="https://doi.org/10.1177/23998083241267331"><code>10.1177/23998083241267331</code></a></p>
In [9]:
all_paras[1].text
'Botta, Federico, Robin Lovelace, Laura Gilbert, and Arthur Turrell. "Packaging code and data for reproducible research: A case study of journey time statistics." Environment and Planning B: Urban Analytics and City Science (2024): 23998083241267331. doi: 10.1177/23998083241267331'
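The textwrap module imported earlier hasn't been used yet; it is handy for displaying long extracted paragraphs. A minimal sketch:

# Wrap the paragraph text at 70 characters so it prints neatly
print(textwrap.fill(all_paras[1].text, width=70))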
In [10]:
projects = soup.find_all("div", class_="project-content listing-pub-info")
projects = [x.text.strip() for x in projects]
projects[:4]
['Botta, Federico, Robin Lovelace, Laura Gilbert, and Arthur Turrell. "Packaging code and data for reproducible research: A case study of journey time statistics." Environment and Planning B: Urban Analytics and City Science (2024): 23998083241267331. doi: 10.1177/23998083241267331',
 'Kalamara, Eleni, Arthur Turrell, Chris Redl, George Kapetanios, and Sujit Kapadia. "Making text count: economic forecasting using newspaper text." Journal of Applied Econometrics 37, no. 5 (2022): 896-919. doi: 10.1002/jae.2907',
 'Turrell, A., Speigner, B., Copple, D., Djumalieva, J. and Thurgood, J., 2021. Is the UK’s productivity puzzle mostly driven by occupational mismatch? An analysis using big data on job vacancies. Labour Economics, 71, p.102013. doi: 10.1016/j.labeco.2021.102013',
 'Haldane, Andrew G., and Arthur E. Turrell. "Drawing on different disciplines: macroeconomic agent-based models." Journal of Evolutionary Economics 29 (2019): 39-66. doi: 10.1007/s00191-018-0557-5']
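Since the publications are now plain strings, they drop straight into a dataframe for further analysis; a small sketch using the pandas import from earlier (the column name "publication" is just illustrative):

# Put the scraped publication strings into a one-column dataframe
df_projects = pd.DataFrame({"publication": projects})
df_projects.head()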
In [11]:
df_list = pd.read_html(
    "https://simple.wikipedia.org/wiki/FIFA_World_Cup", match="Sweden"
)
# Retrieve first and only entry from list of dataframes
df = df_list[0]
df.head()
Years Hosts Winners Score Runner's-up Third place Score.1 Fourth place
0 1930 Details Uruguay Uruguay 4 - 2 Argentina United States [note 1] Yugoslavia
1 1934 Details Italy Italy 2 - 1 Czechoslovakia Germany 3 - 2 Austria
2 1938 Details France Italy 4 - 2 Hungary Brazil 4 - 2 Sweden
3 1950 Details Brazil Uruguay 2 - 1 Brazil Sweden [note 2] Spain
4 1954 Details Switzerland West Germany 3 - 2 Hungary Austria 3 - 1 Uruguay
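With the table in a dataframe, ordinary pandas operations apply; for example, a quick tally of how often each country appears in the Winners column (column name taken from the output above):

# Count appearances in the Winners column
df["Winners"].value_counts().head()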
In [12]:
%pip install pdftotext
Collecting pdftotext
  Using cached pdftotext-2.2.2.tar.gz (113 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pdftotext
  Building wheel for pdftotext (setup.py): started
  Building wheel for pdftotext (setup.py): finished with status 'error'
  Running setup.py clean for pdftotext
Failed to build pdftotext
Note: you may need to restart the kernel to use updated packages.
  error: subprocess-exited-with-error
  
  × python setup.py bdist_wheel did not run successfully.
  │ exit code: 1
  ╰─> [11 lines of output]
      WARNING: pkg-config not found--guessing at poppler version.
               If the build fails, install pkg-config and try again.
      WARNING: pkg-config not found--guessing at poppler version.
               If the build fails, install pkg-config and try again.
      WARNING: pkg-config not found--guessing at poppler version.
               If the build fails, install pkg-config and try again.
      running bdist_wheel
      running build
      running build_ext
      building 'pdftotext' extension
      error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for pdftotext
ERROR: Could not build wheels for pdftotext, which is required to install pyproject.toml-based projects
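The wheel build failed because pdftotext needs a C++ compiler and the poppler library at install time. On a machine where the install does succeed, the intended usage is roughly as follows (a sketch only; "report.pdf" is a placeholder filename):

import pdftotext

# Open a PDF and extract the text of every page
with open("report.pdf", "rb") as f:  # placeholder file
    pdf = pdftotext.PDF(f)
print("\n\n".join(pdf))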
In [13]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
In [14]:
# Download data on IMDb's top 250 movies
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
In [15]:
movies = soup.select('td.titleColumn')
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value')
        for b in soup.select('td.posterColumn span[name=ir]')]
In [16]:
# Create an empty list to store each movie's details
movie_data = []

# Iterate over the movies and extract
# each movie's details
for index in range(0, len(movies)):

    # Separate each entry into: 'place',
    # 'title', 'year'
    movie_string = movies[index].get_text()
    movie = ' '.join(movie_string.split()).replace('.', '')
    place = movie.split(' ')[0]
    movie_title = movie[len(place) + 1:-7]
    year = re.search(r'\((.*?)\)', movie_string).group(1)
    data = {"place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
            }
    movie_data.append(data)
In [17]:
for movie in movie_data:
    print(movie['place'], '-', movie['movie_title'], '(' + movie['year'] +
          ') -', 'Starring:', movie['star_cast'], movie['rating'])
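Nothing is printed here because the selectors in the earlier cell matched no elements, most likely because IMDb has changed the chart's markup since this snippet was written (and the site may reject requests without browser-like headers), so movies, crew, and ratings are all empty. A quick check makes that visible:

# If these are all 0, the CSS selectors no longer match the page
print(len(movies), len(crew), len(ratings))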
In [18]:
# Save the list of dicts as a dataframe,
# then write it out to a .csv file
df = pd.DataFrame(movie_data)
df.to_csv('imdb_top_250_movies.csv', index=False)
In [19]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd


# Download data on IMDb's top 250 movies
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
movies = soup.select('td.titleColumn')
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value')
        for b in soup.select('td.posterColumn span[name=ir]')]




# Create an empty list to store each movie's details
movie_data = []

# Iterate over the movies and extract
# each movie's details
for index in range(0, len(movies)):

    # Separate each entry into: 'place',
    # 'title', 'year'
    movie_string = movies[index].get_text()
    movie = ' '.join(movie_string.split()).replace('.', '')
    place = movie.split(' ')[0]
    movie_title = movie[len(place) + 1:-7]
    year = re.search(r'\((.*?)\)', movie_string).group(1)
    data = {"place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
            }
    movie_data.append(data)

# Print each movie's details along with its rating
for movie in movie_data:
    print(movie['place'], '-', movie['movie_title'], '(' + movie['year'] +
          ') -', 'Starring:', movie['star_cast'], movie['rating'])


# Save the results to a .csv file
df = pd.DataFrame(movie_data)
df.to_csv('imdb_top_250_movies.csv', index=False)
In [20]:
%pip install requests
Requirement already satisfied: requests in c:\users\shuos\anaconda3\lib\site-packages (2.32.2)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (3.7)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2.2.2)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\shuos\anaconda3\lib\site-packages (from requests) (2024.8.30)
Note: you may need to restart the kernel to use updated packages.
In [21]:
%pip install beautifulsoup4
Requirement already satisfied: beautifulsoup4 in c:\users\shuos\anaconda3\lib\site-packages (4.12.3)
Requirement already satisfied: soupsieve>1.2 in c:\users\shuos\anaconda3\lib\site-packages (from beautifulsoup4) (2.5)
Note: you may need to restart the kernel to use updated packages.
In [22]:
import requests

# Define the request URL and headers
url = "https://movie.douban.com/top250"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'  # Set the encoding
html_content = response.text  # Get the page's HTML content
print("Page content loaded successfully!")
Page content loaded successfully!
In [23]:
from bs4 import BeautifulSoup

# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract each film's title, description, rating, and number of votes
movies = []
for item in soup.find_all('div', class_='item'):
    title = item.find('span', class_='title').get_text()  # Film title
    description = item.find('span', class_='inq')  # Film description
    rating = item.find('span', class_='rating_num').get_text()  # Rating
    votes = item.find('div', class_='star').find_all('span')[3].get_text()  # Number of votes

    # If there is no description, set it to an empty string
    if description:
        description = description.get_text()
    else:
        description = ''

    movie = {
        "title": title,
        "description": description,
        "rating": rating,
        "votes": votes.replace('人评价', '').strip()  # Strip the '人评价' ("people rated") suffix
    }
    movies.append(movie)

print("Data extracted successfully!")
Data extracted successfully!
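Before writing the file, it's worth a quick sanity check that the extraction produced what was expected; a minimal peek at the first few records:

# Inspect the first few extracted records
for m in movies[:3]:
    print(m["title"], m["rating"], m["votes"])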
In [24]:
import csv

# Save the data to a CSV file
with open('douban_top250.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['title', 'description', 'rating', 'votes']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()  # Write the header row
    for movie in movies:
        writer.writerow(movie)  # Write one row per film

print("Data saved to douban_top250.csv")
Data saved to douban_top250.csv
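To confirm the file was written correctly, it can be read straight back in with pandas; a quick check:

import pandas as pd

# Read the CSV back in and preview it
df_douban = pd.read_csv('douban_top250.csv')
df_douban.head()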
In [25]:
import requests
from bs4 import BeautifulSoup
import csv

# Define the request URL and headers
url = "https://movie.douban.com/top250"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send a GET request
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'  # Set the encoding
html_content = response.text  # Get the page's HTML content

# Parse the HTML with Beautiful Soup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract each film's title, description, rating, and number of votes
movies = []
for item in soup.find_all('div', class_='item'):
    title = item.find('span', class_='title').get_text()  # Film title
    description = item.find('span', class_='inq')  # Film description
    rating = item.find('span', class_='rating_num').get_text()  # Rating
    votes = item.find('div', class_='star').find_all('span')[3].get_text()  # Number of votes

    # If there is no description, set it to an empty string
    if description:
        description = description.get_text()
    else:
        description = ''

    movie = {
        "title": title,
        "description": description,
        "rating": rating,
        "votes": votes.replace('人评价', '').strip()  # Strip the '人评价' ("people rated") suffix
    }
    movies.append(movie)

# Save the data to a CSV file
with open('douban_top250.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['title', 'description', 'rating', 'votes']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()  # Write the header row
    for movie in movies:
        writer.writerow(movie)  # Write one row per film

print("Data saved to douban_top250.csv")
Data saved to douban_top250.csv